Pass Task- 3¶

In [1]:
import numpy as np
import nbconvert
import warnings
warnings.simplefilter(action='ignore', category=Warning)
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,plot_confusion_matrix,classification_report,precision_score,recall_score
from sklearn.svm import SVC,SVR
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.manifold import TSNE
from feature_engine.selection import DropConstantFeatures,DropCorrelatedFeatures,DropDuplicateFeatures,SmartCorrelatedSelection
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,f1_score,plot_confusion_matrix,roc_auc_score
warnings.filterwarnings(action='ignore')
from sklearn.tree import DecisionTreeClassifier

1 Load "digits" datasets from SKlearn. Classify digit classes using KNN. Use the same data splitting and performance metrics that you have used in previous week (week 4). Report your findings including comparison of results with week 4¶

In [2]:
# Loading the digits dataset
digits = datasets.load_digits(as_frame=True)
df = digits['frame']
In [3]:
print(f"Shape of the dataset:{df.shape}")
Shape of the dataset:(1797, 65)
In [4]:
# Creating a function to build a classification model and return its metrics

def create_classification_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit `model`, print its evaluation metrics, and return them as a DataFrame.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column of the returned DataFrame.
    model : estimator
        Any sklearn-style classifier exposing fit / predict / predict_proba.
    X_train, X_test, y_train, y_test :
        Train/test splits; `y_train`/`y_test` are pandas Series (nunique is used).

    Returns
    -------
    (pd.DataFrame, estimator)
        A one-row metrics DataFrame and the fitted model.
    """
    model.fit(X_train, y_train)

    # Compute each prediction exactly once — the original recomputed
    # predict()/predict_proba()/accuracy_score() several times per call.
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, y_pred)

    # Multiclass targets need weighted F1 and one-vs-rest ROC-AUC on the full
    # probability matrix; binary ROC-AUC expects positive-class probabilities.
    if y_train.nunique() > 2:
        f1 = f1_score(y_test, y_pred, average='weighted')
        roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovr')
    else:
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

    print(f"Training_accuracy_score: {train_acc}")
    print(f"Testing_accuracy_score: {test_acc}")
    print(f"f1_score: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print("\n---------Classification report----------\n")
    print(classification_report(y_test, y_pred))

    metrics_df = pd.DataFrame({'Model': [model_name],
                               'Training_accuracy_score': [train_acc],
                               'Testing_accuracy_score': [test_acc],
                               'roc_auc_score': [roc_auc],
                               'f1_score': [f1]})
    return metrics_df, model


In [5]:
# Creating data for training the model
X = df.drop(columns=['target'])
y = df['target']
In [6]:
# Use the same data splitting as did for week 4 using SVM
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)
In [7]:
# Standardising the data
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
In [8]:
# Choose K by plotting train/test error curves and looking for the point
# where test error bottoms out (bias-variance trade-off visualisation).
neighbors = range(1, 20)
train_err, test_err = [], []
for n_neighbors in neighbors:
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
    candidate.fit(scaled_X_train, y_train)
    train_err.append(1 - accuracy_score(y_train, candidate.predict(scaled_X_train)))
    test_err.append(1 - accuracy_score(y_test, candidate.predict(scaled_X_test)))

plt.plot(neighbors, test_err, label='test_err', marker='*')
plt.plot(neighbors, train_err, label='train_err', marker='*')
plt.vlines(x=13, ymin=0, ymax=0.06)  # marks the chosen K = 13
plt.ylabel('Error')
plt.xlabel('Number_of_neighbors')
plt.legend()
Out[8]:
<matplotlib.legend.Legend at 0x2283bf54fd0>

From the above plot, it is likely best to set k (number of neighbors) = 13

In [9]:
# training a default knn model
model_knn = KNeighborsClassifier(n_neighbors=13)

result_knn_df_eucledian,model_knn = create_classification_model('KnearestNeighbor_eucledian',model_knn,
                                                  scaled_X_train,
                                                  scaled_X_test,
                                                  y_train,
                                                  y_test)
Training_accuracy_score: 0.9684123025768911
Testing_accuracy_score: 0.968013468013468
f1_score: 0.9679731865847384
roc_auc_score: 0.9962960228882123

---------Classification report----------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        59
           1       0.91      0.97      0.94        60
           2       0.98      0.98      0.98        59
           3       0.98      0.98      0.98        60
           4       1.00      0.95      0.97        60
           5       0.97      0.97      0.97        60
           6       0.97      1.00      0.98        60
           7       0.95      0.98      0.97        59
           8       0.96      0.90      0.93        58
           9       0.97      0.95      0.96        59

    accuracy                           0.97       594
   macro avg       0.97      0.97      0.97       594
weighted avg       0.97      0.97      0.97       594

From above report and metrics,we have a decent accuracy as well as good recall score of every class

Now comparing the above results from KNN with below week 4 SVM

In [10]:
# training the digits dataset with week 4 SVM  
digits = datasets.load_digits(as_frame=True)
digits_df = digits['frame']

def create_classification_model_4(model_name, model, X_train, X_test, y_train, y_test):
    """Week-4 variant: fit `model`, print its metrics, return them as a DataFrame.

    Uses `model.score` (mean accuracy) instead of explicit accuracy_score, and
    always reports weighted F1 and one-vs-rest ROC-AUC (multiclass assumption).

    Returns
    -------
    (pd.DataFrame, estimator)
        A one-row metrics DataFrame and the fitted model.
    """
    model.fit(X_train, y_train)

    # Compute each metric once — the original re-ran score()/predict_proba()
    # a second time while building the result DataFrame.
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovr')

    print(f"Training_score: {train_score}")
    print(f"Testing_score: {test_score}")
    print(f"f1_score: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print("\n---------Classification report----------\n")
    print(classification_report(y_test, y_pred))

    metrics_df = pd.DataFrame({'Model': [model_name],
                               'Training_score': [train_score],
                               'Testing_score': [test_score],
                               'roc_auc_score': [roc_auc],
                               'f1_score': [f1]})
    return metrics_df, model

# Extracting the pixel feature columns from the dataset
digits_df_predictors = digits_df.iloc[:,:-1]
digits_df_predictors

# standardising the dataset
scaler = StandardScaler()
scaled_digits_df_predictors = scaler.fit_transform(digits_df_predictors)

#applying PCA on the whole dataset predictors
pca_digits = PCA()
pca_transformed_digits_df_predictors = pca_digits.fit_transform(scaled_digits_df_predictors)
pca_transformed_digits_df_predictors


# Taking first 3 components
pca_transformed_digits_df_predictors_df = pd.DataFrame(pca_transformed_digits_df_predictors[:,[0,1,2]],
                                                       columns=['pc1','pc2','pc3'])

pca_transformed_digits_df_full = pd.concat([pca_transformed_digits_df_predictors_df,digits_df.iloc[:,-1]],axis = 1)
# pca_transformed_digits_df_full


X = pca_transformed_digits_df_full.iloc[:,0:3]
y = pca_transformed_digits_df_full['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


svc = SVC(probability=True,kernel='rbf',C=3.1,gamma=0.1)

# param_grid = {"C":np.arange(start=0.1, stop=5, step=0.1),
#              # "degree":[2,3,4],
#              'gamma' : [1,0.1,0.01,0.001]}


# kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
# grid = GridSearchCV(estimator=svc,param_grid=param_grid,
#                     cv = kf,return_train_score=True,verbose=1)
SVC_df,grid_svc = create_classification_model_4('SVC',svc,X_train,X_test,y_train,y_test)
Training_score: 0.8003182179793158
Testing_score: 0.7462962962962963
f1_score: 0.7357826318887575
roc_auc_score: 0.9628497633083247

---------Classification report----------

              precision    recall  f1-score   support

           0       0.89      0.92      0.91        53
           1       0.79      0.82      0.80        50
           2       0.85      0.85      0.85        47
           3       0.73      0.69      0.70        54
           4       0.95      0.95      0.95        60
           5       0.42      0.20      0.27        66
           6       1.00      0.92      0.96        53
           7       0.77      0.84      0.80        55
           8       0.44      0.63      0.52        43
           9       0.59      0.75      0.66        59

    accuracy                           0.75       540
   macro avg       0.74      0.76      0.74       540
weighted avg       0.74      0.75      0.74       540

As we see, SVM did not perform well across all classes, as some of the classes have poor recall and precision scores
whereas KNN performs decent on this dataset
Training and testing score of KNN is much better than training and testing score of SVM

In [11]:
SVC_df.rename(columns={'Training_score':'Training_accuracy_score','Testing_score':'Testing_accuracy_score'},
             inplace=True)
In [12]:
# DataFrame.append was deprecated and removed in pandas 2.0 — use pd.concat.
results = pd.concat([result_knn_df_eucledian, SVC_df])
results
Out[12]:
Model Training_accuracy_score Testing_accuracy_score roc_auc_score f1_score
0 KnearestNeighbor_eucledian 0.968412 0.968013 0.996296 0.967973
0 SVC 0.800318 0.746296 0.962850 0.735783


2. Create digits classification model using DT algorithm using 50-50% and 70-30% data splitting methods. Compare performances of these two models and explain the impact of difference in data splitting on the performances of the model.¶

In [13]:
# Loading the digits dataset
digits = datasets.load_digits(as_frame=True)
df = digits['frame']

# Creating data for training the model
X = df.drop(columns=['target'])
y = df['target']

# Splitting the dataset into 50% train and 50% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

# Fixed typo in the printed messages: "Shapa" -> "Shape".
print(f"Shape of X_train:{X_train.shape}")
print(f"Shape of y_train:{y_train.shape}")
print(f"Shape of X_test:{X_test.shape}")
print(f"Shape of y_test:{y_test.shape}")

# Checking the optimal tree depth by visualizing over-/under-fitting
test_50_50_err = []
train_50_50_err = []
depth_tree = range(1, 20)
for depth in depth_tree:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    test_50_50_err.append(1 - accuracy_score(y_test, y_pred))
    train_50_50_err.append(1 - accuracy_score(y_train, dt.predict(X_train)))

# Plotting the error curves
plt.plot(depth_tree, test_50_50_err, label='test_50_50_err')
plt.plot(depth_tree, train_50_50_err, label='train_50_50_err')
plt.ylabel("Error")
plt.xlabel("depth of tree")
plt.title("Error vs Depth of tree using 50-50 splitting")
plt.legend();
Shapa of X_train:(898, 64)
Shapa of y_train:(898,)
Shapa of X_test:(899, 64)
Shapa of y_test:(899,)
In [14]:
# Splitting the dataset into 70% train and 30% test set (comment fixed: was "50% test")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fixed typo in the printed messages: "Shapa" -> "Shape".
print(f"Shape of X_train:{X_train.shape}")
print(f"Shape of y_train:{y_train.shape}")
print(f"Shape of X_test:{X_test.shape}")
print(f"Shape of y_test:{y_test.shape}")

test_70_30_err = []
train_70_30_err = []
depth_tree = range(1, 20)
for depth in depth_tree:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    test_70_30_err.append(1 - accuracy_score(y_test, y_pred))
    train_70_30_err.append(1 - accuracy_score(y_train, dt.predict(X_train)))

# Fixed: the original had the split annotation on the xlabel instead of the
# title, inconsistent with the 50-50 cell above.
plt.plot(depth_tree, test_70_30_err, label='test_70_30_err')
plt.plot(depth_tree, train_70_30_err, label='train_70_30_err')
plt.ylabel("Error")
plt.xlabel("depth of tree")
plt.title("Error vs Depth of tree using 70-30 splitting")
plt.legend();
Shapa of X_train:(1257, 64)
Shapa of y_train:(1257,)
Shapa of X_test:(540, 64)
Shapa of y_test:(540,)
In [15]:
# comparing train/test error vs depth of tree with different train/test split data
plt.plot(depth_tree,test_50_50_err,label = 'test_50_50_err')
plt.plot(depth_tree,train_50_50_err,label = 'train_50_50_err')
plt.plot(depth_tree,test_70_30_err,label = 'test_70_30_err')
plt.plot(depth_tree,train_70_30_err,label = 'train_70_30_err')
plt.vlines(x = 7,ymax=0.9,ymin = 0)
plt.ylabel("Error")
plt.xlabel("depth of tree")
plt.title("Error vs Depth of tree for 50-50 & 70-30 split")
plt.legend();
# plt.legend()

from above graphs,it seems that using both the splitting technique (50-50 & 70-30)->depth_of_tree comes out to be 7

In [16]:
# training a DT algorithm using depth = 7 on 50-50 train-test split data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42,stratify=y)

print("----Recorded Metrics using DT on 50-50 train-test split---\n")
dt = DecisionTreeClassifier(max_depth=7)
result_df_50_50,model_dt = create_classification_model('DecisionTree_50_50',dt,
                                                  X_train,
                                                  X_test,
                                                  y_train,
                                                  y_test)
----Recorded Metrics using DT on 50-50 train-test split---

Training_accuracy_score: 0.9020044543429844
Testing_accuracy_score: 0.8242491657397107
f1_score: 0.8249019275363464
roc_auc_score: 0.9251636635275006

---------Classification report----------

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        89
           1       0.71      0.78      0.74        91
           2       0.77      0.78      0.78        88
           3       0.84      0.71      0.77        92
           4       0.84      0.84      0.84        91
           5       0.91      0.87      0.89        91
           6       0.95      0.95      0.95        91
           7       0.80      0.89      0.84        89
           8       0.81      0.74      0.77        87
           9       0.68      0.76      0.72        90

    accuracy                           0.82       899
   macro avg       0.83      0.82      0.82       899
weighted avg       0.83      0.82      0.82       899

In [17]:
result_df_50_50
Out[17]:
Model Training_accuracy_score Testing_accuracy_score roc_auc_score f1_score
0 DecisionTree_50_50 0.902004 0.824249 0.925164 0.824902
In [18]:
# training a DT algorithm using depth = 7 on 70-30  train-test split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

print("----Recorded Metrics using DT on 70-30 train-test split---\n")


dt = DecisionTreeClassifier(max_depth=7)
result_df_70_30,model_dt = create_classification_model('DecisionTree_70_30',dt,
                                                  X_train,
                                                  X_test,
                                                  y_train,
                                                  y_test)
----Recorded Metrics using DT on 70-30 train-test split---

Training_accuracy_score: 0.9108989657915673
Testing_accuracy_score: 0.8203703703703704
f1_score: 0.8216944835965356
roc_auc_score: 0.9266031372940701

---------Classification report----------

              precision    recall  f1-score   support

           0       0.96      0.91      0.93        54
           1       0.73      0.73      0.73        55
           2       0.80      0.77      0.79        53
           3       0.85      0.84      0.84        55
           4       0.79      0.85      0.82        54
           5       0.92      0.87      0.90        55
           6       0.98      0.94      0.96        54
           7       0.80      0.83      0.82        54
           8       0.66      0.75      0.70        52
           9       0.73      0.70      0.72        54

    accuracy                           0.82       540
   macro avg       0.82      0.82      0.82       540
weighted avg       0.82      0.82      0.82       540

In [19]:
# DataFrame.append was deprecated and removed in pandas 2.0 — use pd.concat.
df = pd.concat([result_df_50_50, result_df_70_30])
df
Out[19]:
Model Training_accuracy_score Testing_accuracy_score roc_auc_score f1_score
0 DecisionTree_50_50 0.902004 0.824249 0.925164 0.824902
0 DecisionTree_70_30 0.910899 0.820370 0.926603 0.821694

So,from above comparison using dataframe ::--
By using greater data for training we can achieve greater training score
70-30 split and 50-50 split achieved similar metrics in testing, roc_auc & f1_score


3.Create two more KNN-based classification models using the dataset used in Q1 by varying distance metrics such as using cityblock and cosine. Report the performances of the developed models including Q1 and explain the similarity or differences if any¶

In [20]:
# Loading the digits dataset
digits = datasets.load_digits(as_frame=True)
df = digits['frame']

# Creating data for training the model
X = df.drop(columns=['target'])
y = df['target']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Creating knn using cosine as distance metric
model_knn = KNeighborsClassifier(n_neighbors=17,metric='cosine')

print("----Recorded Metrics using distance metric-cosine for KNN---\n")

result_knn_df_cosine,model_knn = create_classification_model('KnearestNeighbor_cosine',model_knn,
                                                  scaled_X_train,
                                                  scaled_X_test,
                                                  y_train,
                                                  y_test)
----Recorded Metrics using distance metric-cosine for KNN---

Training_accuracy_score: 0.9609310058187863
Testing_accuracy_score: 0.9410774410774411
f1_score: 0.9404458620892752
roc_auc_score: 0.9956006984170029

---------Classification report----------

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        59
           1       0.81      0.98      0.89        60
           2       0.97      0.97      0.97        59
           3       0.98      0.97      0.97        60
           4       0.98      0.93      0.96        60
           5       0.92      0.95      0.93        60
           6       0.98      1.00      0.99        60
           7       0.88      0.98      0.93        59
           8       1.00      0.72      0.84        58
           9       0.95      0.92      0.93        59

    accuracy                           0.94       594
   macro avg       0.95      0.94      0.94       594
weighted avg       0.95      0.94      0.94       594

In [21]:
# Using the cityblock distance metric, also known as manhattan distance (p = 1)
model_knn = KNeighborsClassifier(n_neighbors=17, p=1)

# Fixed typo in the printed banner: "mahattan" -> "manhattan".
print('---# using distance metrics as cityblock also known as manhattan distance when p =1--\n')
result_knn_df_cityblock, model_knn = create_classification_model('KnearestNeighbor_cityblock', model_knn,
                                                  scaled_X_train,
                                                  scaled_X_test,
                                                  y_train,
                                                  y_test)
---# using distance metrics as cityblock also known as mahattan distance when p =1--

Training_accuracy_score: 0.9650872817955112
Testing_accuracy_score: 0.9629629629629629
f1_score: 0.9628919894095478
roc_auc_score: 0.9962049031204085

---------Classification report----------

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        59
           1       0.86      0.98      0.91        60
           2       1.00      0.98      0.99        59
           3       0.98      0.98      0.98        60
           4       0.98      0.98      0.98        60
           5       1.00      0.95      0.97        60
           6       0.98      0.98      0.98        60
           7       0.94      1.00      0.97        59
           8       0.96      0.83      0.89        58
           9       0.95      0.95      0.95        59

    accuracy                           0.96       594
   macro avg       0.97      0.96      0.96       594
weighted avg       0.97      0.96      0.96       594

In [22]:
# DataFrame.append was deprecated and removed in pandas 2.0 — a single
# pd.concat replaces the chained .append() calls.
df = pd.concat([result_knn_df_eucledian, result_knn_df_cosine, result_knn_df_cityblock])
df
Out[22]:
Model Training_accuracy_score Testing_accuracy_score roc_auc_score f1_score
0 KnearestNeighbor_eucledian 0.968412 0.968013 0.996296 0.967973
0 KnearestNeighbor_cosine 0.960931 0.941077 0.995601 0.940446
0 KnearestNeighbor_cityblock 0.965087 0.962963 0.996205 0.962892

ROC_AUC_score for above three models are quite similar
f1_score using cosine distance is lower compared to the similar performance achieved by cityblock and euclidean
Similarly, testing accuracy using cosine distance is lower compared to the similar performance achieved by cityblock and euclidean


4. Creating random forest model using HR-Employee-Attrition.csv dataset and improve the result using hyperparameter tuning. Hints. Visualise your performance fluctuation for different hyperparameter values.¶

In [23]:
hr_df = pd.read_csv('HR-Employee-Attrition.csv')
hr_df.head()
Out[23]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [24]:
# Checking for null values
hr_df.isna().sum()
Out[24]:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
In [25]:
# checking for number of unique values in features
# Summarise the cardinality of every column; for low-cardinality
# (categorical-looking) features, also list the actual values.
for col in hr_df.columns:
    n_unique = hr_df[col].nunique()
    if n_unique < 10:
        print(f"{col} has {n_unique} unique values :-\n{hr_df[col].unique()}\n")
    else:
        print(f"{col} has {n_unique} unique values\n")
Age has 43 unique values

Attrition has 2 unique values :-
['Yes' 'No']

BusinessTravel has 3 unique values :-
['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']

DailyRate has 886 unique values

Department has 3 unique values :-
['Sales' 'Research & Development' 'Human Resources']

DistanceFromHome has 29 unique values

Education has 5 unique values :-
[2 1 4 3 5]

EducationField has 6 unique values :-
['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']

EmployeeCount has 1 unique values :-
[1]

EmployeeNumber has 1470 unique values

EnvironmentSatisfaction has 4 unique values :-
[2 3 4 1]

Gender has 2 unique values :-
['Female' 'Male']

HourlyRate has 71 unique values

JobInvolvement has 4 unique values :-
[3 2 4 1]

JobLevel has 5 unique values :-
[2 1 3 4 5]

JobRole has 9 unique values :-
['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']

JobSatisfaction has 4 unique values :-
[4 2 3 1]

MaritalStatus has 3 unique values :-
['Single' 'Married' 'Divorced']

MonthlyIncome has 1349 unique values

MonthlyRate has 1427 unique values

NumCompaniesWorked has 10 unique values

Over18 has 1 unique values :-
['Y']

OverTime has 2 unique values :-
['Yes' 'No']

PercentSalaryHike has 15 unique values

PerformanceRating has 2 unique values :-
[3 4]

RelationshipSatisfaction has 4 unique values :-
[1 4 2 3]

StandardHours has 1 unique values :-
[80]

StockOptionLevel has 4 unique values :-
[0 1 3 2]

TotalWorkingYears has 40 unique values

TrainingTimesLastYear has 7 unique values :-
[0 3 2 5 1 4 6]

WorkLifeBalance has 4 unique values :-
[1 3 2 4]

YearsAtCompany has 37 unique values

YearsInCurrentRole has 19 unique values

YearsSinceLastPromotion has 16 unique values

YearsWithCurrManager has 18 unique values

In [26]:
# Checking for class imbalance
hr_df['Attrition'].value_counts(normalize=True) * 100
Out[26]:
No     83.877551
Yes    16.122449
Name: Attrition, dtype: float64

The data is imbalanced as number of Yes class is very low compared to No class

In [27]:
# Mapping yes & No classes into numerical format
att = {'Yes':1,'No':0}
hr_df['Attrition'] = hr_df['Attrition'].replace(att)
In [28]:
hr_df['Attrition'].value_counts(normalize=True) * 100
Out[28]:
0    83.877551
1    16.122449
Name: Attrition, dtype: float64
In [29]:
hr_df.head()
Out[29]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 0 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 1 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 0 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 0 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [30]:
hr_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   int64 
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(27), object(8)
memory usage: 402.1+ KB
In [31]:
# Converting categorical data into numerical format for training
df = pd.get_dummies(hr_df,drop_first=True)
df.head()
Out[31]:
Age Attrition DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement ... JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Married MaritalStatus_Single OverTime_Yes
0 41 1 1102 1 2 1 1 2 94 3 ... 0 0 0 0 0 1 0 0 1 1
1 49 0 279 8 1 1 2 3 61 2 ... 0 0 0 0 1 0 0 1 0 0
2 37 1 1373 2 2 1 4 4 92 2 ... 1 0 0 0 0 0 0 0 1 1
3 33 0 1392 3 4 1 5 4 56 3 ... 0 0 0 0 1 0 0 1 0 1
4 27 0 591 2 1 1 7 1 40 3 ... 1 0 0 0 0 0 0 1 0 0

5 rows × 48 columns

In [32]:
hr = df.copy()
In [33]:
X = df.drop('Attrition',axis = 1)
y = df['Attrition']
In [34]:
df.head(1)
Out[34]:
Age Attrition DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement ... JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Married MaritalStatus_Single OverTime_Yes
0 41 1 1102 1 2 1 1 2 94 3 ... 0 0 0 0 0 1 0 0 1 1

1 rows × 48 columns

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

print("training the dataset with default Randomforest settings-----\n\nGetting metrics on imbalanced dataset")
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
pred = rf.predict(X_test)
print(classification_report(y_test,pred))
training the dataset with default Randomforest settings-----

Getting metrics on imbalanced dataset
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       408
           1       0.56      0.13      0.21        78

    accuracy                           0.84       486
   macro avg       0.71      0.55      0.56       486
weighted avg       0.81      0.84      0.80       486

From above report , it is seen that recall score for 1(yes) class is very low because we have imbalance class dataset, we cannot rely on accuracy score
To tackle this situation, We will use SMOTE technique to create synthetic data to increase the minority class samples
Train the model using hyperparameter tuning on the new dataset, then check performance on the new dataset
Last we use this model trained on new synthetical created dataset on original imbalance dataset to check the performance

In [36]:
from imblearn.over_sampling import RandomOverSampler,SMOTE
In [37]:
# Class distribution of the target — shows the imbalance (0 dominates 1)
y.value_counts()
Out[37]:
0    1233
1     237
Name: Attrition, dtype: int64
In [38]:
# Generate synthetic minority-class samples so both classes end up balanced
oversampler = SMOTE(sampling_strategy='auto', random_state=1)
X_smote, y_smote = oversampler.fit_resample(X, y)
In [39]:
# Compare class counts before vs. after SMOTE resampling
y.value_counts() , y_smote.value_counts()
Out[39]:
(0    1233
 1     237
 Name: Attrition, dtype: int64,
 1    1233
 0    1233
 Name: Attrition, dtype: int64)

Now Classes are balanced

In [40]:
# Stratified split of the SMOTE-balanced data (same split settings as before)
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.33, random_state=42,stratify=y_smote)
In [41]:
# Hyperparameter search for RandomForest on the SMOTE-balanced data.
# 5-fold CV over 30 candidates; metrics reported by the shared helper.
rf_smote = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 64, 100, 128, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5],
}
rf_grid_smote = GridSearchCV(
    estimator=rf_smote,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    return_train_score=True,
)

print('Recorded Metrics on synthetic data trained with RandomForest hyperparameters---\n')
result_smote_rf, rf_grid_smote = create_classification_model(
    'RandomForest_smote', rf_grid_smote, X_train, X_test, y_train, y_test
)
Recorded Metrics on synthetic data trained with RandomForest hyperparameters---

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Training_accuracy_score: 0.910411622276029
Testing_accuracy_score: 0.8734643734643734
f1_score: 0.8742368742368742
roc_auc_score: 0.939353693653448

---------Classification report----------

              precision    recall  f1-score   support

           0       0.88      0.87      0.87       407
           1       0.87      0.88      0.87       407

    accuracy                           0.87       814
   macro avg       0.87      0.87      0.87       814
weighted avg       0.87      0.87      0.87       814

In [42]:
# Metrics table returned by the helper for the SMOTE-trained RandomForest
result_smote_rf
Out[42]:
Model Training_accuracy_score Testing_accuracy_score roc_auc_score f1_score
0 RandomForest_smote 0.910412 0.873464 0.939354 0.874237
In [43]:
# Best hyperparameter combination found by the grid search
print(f"best parameters for RandomForest:\n{rf_grid_smote.best_params_}")
best parameters for RandomForest:
{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100}
In [44]:
# Keep only the interesting columns from the full CV results
rf_cv_columns = ['param_criterion', 'param_max_depth', 'param_n_estimators',
                 'params', 'mean_test_score', 'mean_train_score', 'rank_test_score']
rf_grid_smote_result = pd.DataFrame(rf_grid_smote.cv_results_)[rf_cv_columns]
In [45]:
# Rank the hyperparameter configurations by CV test score (best first)
dff = rf_grid_smote_result.sort_values(by = 'rank_test_score')
dff.head()
Out[45]:
param_criterion param_max_depth param_n_estimators params mean_test_score mean_train_score rank_test_score
27 entropy 5 100 {'criterion': 'entropy', 'max_depth': 5, 'n_es... 0.876506 0.913135 1
25 entropy 5 50 {'criterion': 'entropy', 'max_depth': 5, 'n_es... 0.875300 0.909352 2
29 entropy 5 200 {'criterion': 'entropy', 'max_depth': 5, 'n_es... 0.874692 0.911471 3
13 gini 5 128 {'criterion': 'gini', 'max_depth': 5, 'n_estim... 0.874692 0.913590 3
14 gini 5 200 {'criterion': 'gini', 'max_depth': 5, 'n_estim... 0.873485 0.911774 5
In [46]:
# Visualizing the performance fluctuation for different hyperparameter values of RandomForest.
# Fix: removed a leftover `df = px.data.tips()` example line that clobbered the
# working DataFrame with an unrelated dataset, and the duplicate mid-notebook
# `import plotly.express as px` (px is already imported in the first cell).
fig = px.line(
    dff,
    x=dff['params'].astype('str'),
    y="mean_test_score",
    markers=True,
    height=1000,
    title='Performance fluctuation for different hyperparameter values of RandomForest',
)
fig.show()
In [47]:
# Using Trained Model on Old Imbalanced Dataset.
# Fix: the original recomputed accuracy_score/f1_score and ran predict_proba
# over the full dataset twice (once for the prints, once for the summary
# frame); each metric is now computed once and reused.
X = hr.drop('Attrition',axis =1)
y = hr['Attrition']

y_pred_whole = rf_grid_smote.predict(X)
rf_acc = accuracy_score(y, y_pred_whole)
rf_f1 = f1_score(y, y_pred_whole)
rf_auc = roc_auc_score(y, rf_grid_smote.predict_proba(X)[:, 1])

print("Metrics recorded using model trained with hyperparameters on full original imabalnced dataset\n\n ")
print(f"accuracy_score: {rf_acc}")
print(f"f1_score: {rf_f1}")
print(f"roc_auc_score: {rf_auc}")
print()
# Keep the report string for the later model comparison cells
rf_classification = classification_report(y, y_pred_whole)
print(rf_classification)
rf_imbalance_df = pd.DataFrame({'Model': ['RandomForest_On_Imabalanced'],
                                'accuracy_score': [rf_acc],
                                'f1_score': [rf_f1],
                                'roc_auc_score': [rf_auc]})
Metrics recorded using model trained with hyperparameters on full original imabalnced dataset

 
accuracy_score: 0.8564625850340136
f1_score: 0.5720081135902636
roc_auc_score: 0.8684967883896093

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1233
           1       0.55      0.59      0.57       237

    accuracy                           0.86      1470
   macro avg       0.74      0.75      0.74      1470
weighted avg       0.86      0.86      0.86      1470

From the above classification report:
The recall score for class 1 (yes) has improved significantly compared to the previous recall score of less than 0.2.

5. Creating GradientBoost model using HR-Employee-Attrition.csv dataset and improve the result using hyperparameter tuning. Hints.¶

In [48]:
X = hr.drop('Attrition',axis = 1)
y = hr['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

gb = GradientBoostingClassifier()
gb.fit(X_train,y_train)
pred = gb.predict(X_test)
print("training the dataset with default GradientBoosting settings-----\n\nGetting metrics on imbalanced dataset")

print(classification_report(y_test,pred))
# gb_classification = classification_report(y_test,pred)
training the dataset with default GradientBoosting settings-----

Getting metrics on imbalanced dataset
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       408
           1       0.62      0.27      0.38        78

    accuracy                           0.86       486
   macro avg       0.75      0.62      0.65       486
weighted avg       0.83      0.86      0.83       486

From the above report, it is similarly seen that the recall score for class 1 (yes) is very low because we have an imbalanced dataset, so we cannot rely on the accuracy score alone.
To tackle this situation, we will use the SMOTE technique to create synthetic samples and increase the minority class.
We then train the model with hyperparameter tuning on the new dataset and check its performance there.
Finally, we use this model, trained on the synthetically created dataset, on the original imbalanced dataset to check its performance.

In [49]:
# training a GradientBoosting Algo on new dataset using hyperparameter tuning to get hyperparameters
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.33, random_state=42,stratify=y_smote)

gb_smote = GradientBoostingClassifier()
param_grid = {'n_estimators':[50,100],
             'learning_rate':[0.1,0.05,0.2],
             'max_depth':[3,4,5]}
gb_grid_smote = GridSearchCV(estimator=gb_smote,param_grid=param_grid,cv=5,verbose=1,return_train_score=True)
# rf_smote.fit(X_train,y_train)
# y_pred = rf_smote.predict(X_test)
# # y_pred[:5]
# print(classification_report(y_test,y_pred))
print('Recorded Metrics on synthetic data trained with GradientBoosting hyperparameters---\n')

result_smote_gb,gb_grid_smote = create_classification_model('GradientBoosting_smote',gb_grid_smote,
                                                  X_train,
                                                  X_test,
                                                  y_train,
                                                  y_test)
Recorded Metrics on synthetic data trained with GradientBoosting hyperparameters---

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Training_accuracy_score: 1.0
Testing_accuracy_score: 0.9115479115479116
f1_score: 0.9095477386934673
roc_auc_score: 0.9719346328682938

---------Classification report----------

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       407
           1       0.93      0.89      0.91       407

    accuracy                           0.91       814
   macro avg       0.91      0.91      0.91       814
weighted avg       0.91      0.91      0.91       814

In [50]:
# Best hyperparameter combination found by the GradientBoosting grid search
print(f"best parameters for GradientBoosting:\n{gb_grid_smote.best_params_}")
best parameters for GradientBoosting:
{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100}
In [51]:
# Keep only the interesting columns from the full CV results
gb_cv_columns = ['param_learning_rate', 'param_max_depth', 'param_n_estimators',
                 'params', 'mean_test_score', 'mean_train_score', 'rank_test_score']
gb_grid_smote_result = pd.DataFrame(gb_grid_smote.cv_results_)[gb_cv_columns]
In [52]:
# Rank the hyperparameter configurations by CV test score (best first)
dff = gb_grid_smote_result.sort_values(by = 'rank_test_score')
dff.head()
Out[52]:
param_learning_rate param_max_depth param_n_estimators params mean_test_score mean_train_score rank_test_score
15 0.2 4 100 {'learning_rate': 0.2, 'max_depth': 4, 'n_esti... 0.912216 1.000000 1
16 0.2 5 50 {'learning_rate': 0.2, 'max_depth': 5, 'n_esti... 0.911616 1.000000 2
17 0.2 5 100 {'learning_rate': 0.2, 'max_depth': 5, 'n_esti... 0.909800 1.000000 3
3 0.1 4 100 {'learning_rate': 0.1, 'max_depth': 4, 'n_esti... 0.909195 0.996217 4
14 0.2 4 50 {'learning_rate': 0.2, 'max_depth': 4, 'n_esti... 0.907981 0.996217 5
In [53]:
# Visualizing the performance fluctuation for different hyperparameter values of GradientBoosting.
# Fix: removed a leftover `df = px.data.tips()` example line that clobbered the
# working DataFrame with an unrelated dataset, and the duplicate mid-notebook
# `import plotly.express as px` (px is already imported in the first cell).
fig = px.line(
    dff,
    x=dff['params'].astype('str'),
    y="mean_test_score",
    markers=True,
    height=1000,
    title='Performance fluctuation for different hyperparameter values of GradientBoosting',
)
fig.show()
In [54]:
# Using Above Trained GradientBoosting Model on Old Imbalanced Dataset.
# Fix: the original recomputed accuracy_score/f1_score and ran predict_proba
# over the full dataset twice (once for the prints, once for the summary
# frame); each metric is now computed once and reused.
X = hr.drop('Attrition',axis =1)
y = hr['Attrition']

y_pred_whole = gb_grid_smote.predict(X)
gb_acc = accuracy_score(y, y_pred_whole)
gb_f1 = f1_score(y, y_pred_whole)
gb_auc = roc_auc_score(y, gb_grid_smote.predict_proba(X)[:, 1])

print("Metrics recorded using Above Trained GradientBoosting Model with hyperparameters on full original imabalnced dataset\n\n ")
print(f"accuracy_score: {gb_acc}")
print(f"f1_score: {gb_f1}")
print(f"roc_auc_score: {gb_auc}")
print()
# Keep the report string for the later model comparison cells
gb_classification = classification_report(y, y_pred_whole)
print(gb_classification)

gb_imbalance_df = pd.DataFrame({'Model': ['GradientBoosting_On_Imabalanced'],
                                'accuracy_score': [gb_acc],
                                'f1_score': [gb_f1],
                                'roc_auc_score': [gb_auc]})
Metrics recorded using Above Trained GradientBoosting Model with hyperparameters on full original imabalnced dataset

 
accuracy_score: 0.9551020408163265
f1_score: 0.8571428571428572
roc_auc_score: 0.974337915481776

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1233
           1       0.88      0.84      0.86       237

    accuracy                           0.96      1470
   macro avg       0.92      0.91      0.92      1470
weighted avg       0.95      0.96      0.95      1470

From the above classification report:
The recall score for class 1 (yes) has improved significantly compared to the previous recall score of less than 0.3 on the imbalanced data.


6. Compare the best model after hyperparameter tuning found in Q4 and Q5, and explain which model is good and why.¶

In [55]:
# Compare the two SMOTE-trained models side by side.
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and produces the same stacked frame.
print('Comparing RandomForest & GradientBoosting trained on Synthetic Created Data\n')
pd.concat([result_smote_rf, result_smote_gb])
Comparing RandomForest & GradientBoosting trained on Synthetic Created Data

Out[55]:
Model Training_accuracy_score Testing_accuracy_score roc_auc_score f1_score
0 RandomForest_smote 0.910412 0.873464 0.939354 0.874237
0 GradientBoosting_smote 1.000000 0.911548 0.971935 0.909548

From the above comparison, GradientBoosting outperforms RandomForest, but GradientBoosting seems to overfit the data.


In [56]:
# Compare both models as evaluated on the original imbalanced data.
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and produces the same stacked frame.
print("Comparison of models trained on original full Imabalnced Data")
result = pd.concat([rf_imbalance_df, gb_imbalance_df])
result
Comparison of models trained on original full Imabalnced Data
Out[56]:
Model accuracy_score f1_score roc_auc_score
0 RandomForest_On_Imabalanced 0.856463 0.572008 0.868497
0 GradientBoosting_On_Imabalanced 0.955102 0.857143 0.974338
In [57]:
# Reprint the stored GradientBoosting report for the final comparison
# (the "GradirantBoosting"/"Imabalanced" typos are in the original output string)
print(f"Classification report on Imabalanced Dataset of GradirantBoosting:\n\n{gb_classification}")
Classification report on Imabalanced Dataset of GradirantBoosting:

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1233
           1       0.88      0.84      0.86       237

    accuracy                           0.96      1470
   macro avg       0.92      0.91      0.92      1470
weighted avg       0.95      0.96      0.95      1470

In [58]:
# Reprint the stored RandomForest report for the final comparison
print(f"Classification report on Imabalanced Dataset of RandomForest:\n\n{rf_classification}")
Classification report on Imabalanced Dataset of RandomForest:

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1233
           1       0.55      0.59      0.57       237

    accuracy                           0.86      1470
   macro avg       0.74      0.75      0.74      1470
weighted avg       0.86      0.86      0.86      1470

Also, GradientBoosting outperforms RandomForest on the imbalanced dataset, as GradientBoosting has a good recall score for class 1 (yes) of over 0.8, compared to under 0.6 for RandomForest.
From all the above results, GradientBoosting should be the preferred choice over RandomForest.


Reference¶

https://hastie.su.domains/ISLP/ISLP_website.pdf
https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
https://scikit-learn.org/stable/modules/model_evaluation.html
https://www.analyticsvidhya.com/blog/2022/02/a-comprehensive-guide-on-hyperparameter-tuning-and-its-techniques/
https://www.datacamp.com/cheat-sheet
https://feature-engine.trainindata.com/en/1.3.x/user_guide/selection/index.html
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/
https://imbalanced-learn.org/stable/

In [ ]: